Human Resources Analysis: Predict Attrition

Notebook 1: Exploratory Data Analysis

Enterprise Data Science Bootcamp | 2021

Post-Graduation in Enterprise Data Science & Analytics

Group TP2 C

Beatriz Crispim (m20201835)
Inês Resende (m20200844)
João Tiago Homem (m20201791)
Tiago Rodrigues (m20201771)

Table of Contents

  • 1. Import and check the dataset
  • 2. Data pre-processing and Understanding
    • 2.1. Describe data
    • 2.2 Statistical exploration
      • 2.2.1 Skewness
      • 2.2.2 Kurtosis
      • 2.2.3 Correlation Matrix
      • 2.2.4 Binning
    • 2.3 Visual exploration
      • 2.3.1 Attrition demographics
        • 2.3.1.1 Attrition
        • 2.3.1.2 Gender
        • 2.3.1.3 Age & Distance from home
        • 2.3.1.4 Marital Status and Education Field
        • 2.3.1.5 Business Travel
      • 2.3.2 Job demographics
        • 2.3.2.1 Job demographics Years with current manager, years in current role, years at the company and years since last promotion
        • 2.3.2.2 Years at company
        • 2.3.2.3 Department, Job Role , Business Travel, OverTime , Job Level
        • 2.3.2.4 Job Role
        • 2.3.2.5 OverTime
        • 2.3.2.6 Job Level
        • 2.3.2.7 Job Level vs Years at company
        • 2.3.2.8 Job Role vs Business Travel
        • 2.3.2.9 Job Role vs Job Level
      • 2.3.3 Employee Satisfaction
        • 2.3.3.1 Work Life Balance, job involvement
        • 2.3.3.2 Environment Satisfaction, Job Satisfaction, Relationship Satisfaction, Job Involvement, Work Life Balance
        • 2.3.3.3 Job Role vs Work life Balance
        • 2.3.3.4 Job Satisfaction vs Years at company
        • 2.3.3.5 Job Role vs Job Satisfaction
      • 2.3.4 Income and hours of work
        • 2.3.4.1 Job Role vs Montly Income
        • 2.3.4.2 Age vs Montly Income
      • 2.3.5 Performance and training relation
      • 2.3.6 Training Times Last Year
      • 2.3.7 Performance and last promotion/SalarayHike relation
        • 2.3.7.1 Age vs Years since last promotion
      • 2.3.8 Stock option level
        • 2.3.8.1 Age vs Stock option level
      • 2.3.9 Years at company
        • 2.3.9.1 Years at company vs Total working years
        • 2.3.9.2 Years at company vs Year since last promotion
  • 3. Data Transformation
    • 3.1 Transform categorical variables
  • 4. Segmentation
    • 4.1 Segmentação attrition='No'
      • 4.1.1 Job Role
      • 4.1.2 Stock option level
      • 4.1.4 Job involvement
      • 4.1.5 Business Travel
      • 4.1.6 OverTime
      • 4.1.7 Training times last year
    • 4.2 Segmentation by job Role
      • 4.2.1 Sales representative (considering attrition)
        • 4.2.1.1 Gender
        • 4.2.1.2 Overtime
        • 4.2.1.3 Montly Income
        • 4.2.1.4 Job Level
        • 4.2.1.5 Stock option level
        • 4.2.1.6 Satisfaction
        • 4.2.1.7 Business Travel
        • 4.2.1.8 Years in current Role
      • 4.2.2 Laboratory Technician (considering attrition)
        • 4.2.2.1 Montly Income
        • 4.2.2.2 Years in current role
        • 4.2.2.3 Years at company
        • 4.2.2.4 Stock option level
        • 4.2.2.5 Job Level
        • 4.2.2.6 Over Time
        • 4.2.2.7 Work Life balance
        • 4.2.2.8 Satisfaction
      • 4.2.3 Human resources
        • 4.2.3.1 Montly Income
        • 4.2.3.2 Over time
        • 4.2.3.3 Training times last year
        • 4.2.3.4 Stock option level
        • 4.2.3.5 Job Level
        • 4.2.3.6 Distance from home
        • 4.2.3.7 Satisfaction
        • 4.2.3.8 Percent Salary hike

Import libraries

In [444]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm
import os
import shutil
from zipfile import ZipFile
import warnings
warnings.filterwarnings('ignore')
from pylab import cm
import matplotlib.font_manager
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from scipy import stats
import matplotlib.patches as mpatches
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import math

Load style

In [445]:
plt.rcParams['font.size'] = 12
plt.rcParams['mathtext.fontset'] = 'custom'
plt.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
plt.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
plt.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
plt.rcParams["text.usetex"] = False
plt.rcParams['axes.linewidth'] = 1
plt.rcParams['xtick.direction'] = 'inout'
plt.rcParams['ytick.direction'] = 'inout'
plt.rcParams['xtick.major.size'] = 5.0
plt.rcParams['xtick.minor.size'] = 3.0
plt.rcParams['ytick.major.size'] = 5.0
plt.rcParams['ytick.minor.size'] = 3.0
plt.rcParams['figure.dpi'] = 100
plt.rcParams['lines.markersize'] = 5
plt.rcParams['lines.linewidth'] = 0.7
plt.rcParams['lines.linestyle'] = '--'
plt.rcParams['lines.marker'] = '.'
plt.rcParams['figure.figsize'] = 8,6
In [446]:
#Auxiliar function
def stats (lista):
    
    mean = '{:.2f}'.format(lista.mean())
    Q1 = '{:.2f}'.format(np.percentile(lista, 25))
    Q3 = '{:.2f}'.format(np.percentile(lista, 75))
    skew = '{:.2f}'.format(sp.stats.skew(lista) )
    return (mean, Q1, Q3, skew)
In [447]:
#Auxiliar function
def statss (lista):
    
    mean = '{:.2f}'.format(lista.median())
    Q1 = '{:.2f}'.format(np.percentile(lista, 25))
    Q3 = '{:.2f}'.format(np.percentile(lista, 75))
    skew = '{:.2f}'.format(sp.stats.skew(lista) )
    return (mean, Q1, Q3, skew)
In [448]:
def plots1 (var):
    
    collumn=df[var]
    maxim=collumn.max()
    minim=collumn.min()

    lista = df[df['Attrition'] == 'Yes']    

    fig, axes = plt.subplots(2,2, figsize = (5,6),gridspec_kw={'height_ratios': [1, 4]})
    g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 30)
    g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )
    g2.set(xlabel=None)
    g2.set(title='Attrition = Yes')
    res = stats (lista[var])

    axes[1,0].text(0, -0.15, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=8,
           color= 'k')
    axes[1,0].text(0.5, -0.15, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=8,
           color= 'k')
    axes[1,0].text(0, -0.21, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=8,
           color= 'k')
    axes[1,0].text(0.5, -0.21, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=8,
           color= 'k')

    plt.xlim(minim,maxim)

    plt.xlabel(var)
    
    lista = df[df['Attrition'] == 'No']    

    g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 30)
    g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )
    g2.set(xlabel=None)
    g2.set(title='Attrition = No')
    res = stats (lista[var])

    axes[1,1].text(0, -0.15, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=8,
           color= 'k')
    axes[1,1].text(0.5, -0.15, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=8,
           color= 'k')
    axes[1,1].text(0, -0.21, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=8,
           color= 'k')
    axes[1,1].text(0.5, -0.21, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=8,
           color= 'k')
    
    axes[1,1].set_ylabel('')  
    
    plt.xlabel(var)
    plt.xlim(minim,maxim)

    plt.show()

The goals are:

  • Descriptive Analytics: Find correlations between the different variables, possibility of clustering analysis, build nice visualizations that may help to get better insights for the analysis, check feature’s cardinality and analyze possible highly correlation features that may be removed and still achieve good results.
  • Predictive Analytics: Build classification models to predict the Attrition probability and the final classification given by the model.

1. Import and check the dataset

In [449]:
df = pd.read_csv(r'C:\Users\Tiago\Desktop\Human resources\HR_DS.csv')
In [450]:
print('Rows number before' ,df.shape[0])
df.drop_duplicates(inplace = True)
print('Rows number after removing duplicates' ,df.shape[0])
Rows number before 1470
Rows number after removing duplicates 1470
In [451]:
df.set_index('EmployeeNumber', inplace = True)

2. Data pre-processing and Understanding

In [452]:
df.head()
Out[452]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EnvironmentSatisfaction ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
EmployeeNumber
1 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 2 ... 1 80 0 8 0 1 6 4 0 5
2 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 3 ... 4 80 1 10 3 3 10 7 1 7
4 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
5 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 4 ... 3 80 0 8 3 3 8 7 3 0
7 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 1 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 34 columns

2.1 Describe data

In [453]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470 entries, 1 to 2068
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                   1470 non-null   object
 15  JobSatisfaction           1470 non-null   int64 
 16  MaritalStatus             1470 non-null   object
 17  MonthlyIncome             1470 non-null   int64 
 18  MonthlyRate               1470 non-null   int64 
 19  NumCompaniesWorked        1470 non-null   int64 
 20  Over18                    1470 non-null   object
 21  OverTime                  1470 non-null   object
 22  PercentSalaryHike         1470 non-null   int64 
 23  PerformanceRating         1470 non-null   int64 
 24  RelationshipSatisfaction  1470 non-null   int64 
 25  StandardHours             1470 non-null   int64 
 26  StockOptionLevel          1470 non-null   int64 
 27  TotalWorkingYears         1470 non-null   int64 
 28  TrainingTimesLastYear     1470 non-null   int64 
 29  WorkLifeBalance           1470 non-null   int64 
 30  YearsAtCompany            1470 non-null   int64 
 31  YearsInCurrentRole        1470 non-null   int64 
 32  YearsSinceLastPromotion   1470 non-null   int64 
 33  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(25), object(9)
memory usage: 402.0+ KB
In [454]:
df.describe(include=[np.number]).T
Out[454]:
count mean std min 25% 50% 75% max
Age 1470.0 36.923810 9.135373 18.0 30.0 36.0 43.00 60.0
DailyRate 1470.0 802.485714 403.509100 102.0 465.0 802.0 1157.00 1499.0
DistanceFromHome 1470.0 9.192517 8.106864 1.0 2.0 7.0 14.00 29.0
Education 1470.0 2.912925 1.024165 1.0 2.0 3.0 4.00 5.0
EmployeeCount 1470.0 1.000000 0.000000 1.0 1.0 1.0 1.00 1.0
EnvironmentSatisfaction 1470.0 2.721769 1.093082 1.0 2.0 3.0 4.00 4.0
HourlyRate 1470.0 65.891156 20.329428 30.0 48.0 66.0 83.75 100.0
JobInvolvement 1470.0 2.729932 0.711561 1.0 2.0 3.0 3.00 4.0
JobLevel 1470.0 2.063946 1.106940 1.0 1.0 2.0 3.00 5.0
JobSatisfaction 1470.0 2.728571 1.102846 1.0 2.0 3.0 4.00 4.0
MonthlyIncome 1470.0 6502.931293 4707.956783 1009.0 2911.0 4919.0 8379.00 19999.0
MonthlyRate 1470.0 14313.103401 7117.786044 2094.0 8047.0 14235.5 20461.50 26999.0
NumCompaniesWorked 1470.0 2.693197 2.498009 0.0 1.0 2.0 4.00 9.0
PercentSalaryHike 1470.0 15.209524 3.659938 11.0 12.0 14.0 18.00 25.0
PerformanceRating 1470.0 3.153741 0.360824 3.0 3.0 3.0 3.00 4.0
RelationshipSatisfaction 1470.0 2.712245 1.081209 1.0 2.0 3.0 4.00 4.0
StandardHours 1470.0 80.000000 0.000000 80.0 80.0 80.0 80.00 80.0
StockOptionLevel 1470.0 0.793878 0.852077 0.0 0.0 1.0 1.00 3.0
TotalWorkingYears 1470.0 11.279592 7.780782 0.0 6.0 10.0 15.00 40.0
TrainingTimesLastYear 1470.0 2.799320 1.289271 0.0 2.0 3.0 3.00 6.0
WorkLifeBalance 1470.0 2.761224 0.706476 1.0 2.0 3.0 3.00 4.0
YearsAtCompany 1470.0 7.008163 6.126525 0.0 3.0 5.0 9.00 40.0
YearsInCurrentRole 1470.0 4.229252 3.623137 0.0 2.0 3.0 7.00 18.0
YearsSinceLastPromotion 1470.0 2.187755 3.222430 0.0 0.0 1.0 3.00 15.0
YearsWithCurrManager 1470.0 4.123129 3.568136 0.0 2.0 3.0 7.00 17.0
In [455]:
df.describe(include=[object]).T
Out[455]:
count unique top freq
Attrition 1470 2 No 1233
BusinessTravel 1470 3 Travel_Rarely 1043
Department 1470 3 Research & Development 961
EducationField 1470 6 Life Sciences 606
Gender 1470 2 Male 882
JobRole 1470 9 Sales Executive 326
MaritalStatus 1470 3 Married 673
Over18 1470 1 Y 1470
OverTime 1470 2 No 1054

2.2 Statistical exploration

2.2.1 Skewness

In [456]:
sk = df.skew()
sk 
Out[456]:
Age                         0.413286
DailyRate                  -0.003519
DistanceFromHome            0.958118
Education                  -0.289681
EmployeeCount               0.000000
EnvironmentSatisfaction    -0.321654
HourlyRate                 -0.032311
JobInvolvement             -0.498419
JobLevel                    1.025401
JobSatisfaction            -0.329672
MonthlyIncome               1.369817
MonthlyRate                 0.018578
NumCompaniesWorked          1.026471
PercentSalaryHike           0.821128
PerformanceRating           1.921883
RelationshipSatisfaction   -0.302828
StandardHours               0.000000
StockOptionLevel            0.968980
TotalWorkingYears           1.117172
TrainingTimesLastYear       0.553124
WorkLifeBalance            -0.552480
YearsAtCompany              1.764529
YearsInCurrentRole          0.917363
YearsSinceLastPromotion     1.984290
YearsWithCurrManager        0.833451
dtype: float64

2.2.2 Kurtosis

In [457]:
kt = df.kurt()
kt 
Out[457]:
Age                        -0.404145
DailyRate                  -1.203823
DistanceFromHome           -0.224833
Education                  -0.559115
EmployeeCount               0.000000
EnvironmentSatisfaction    -1.202521
HourlyRate                 -1.196398
JobInvolvement              0.270999
JobLevel                    0.399152
JobSatisfaction            -1.222193
MonthlyIncome               1.005233
MonthlyRate                -1.214956
NumCompaniesWorked          0.010214
PercentSalaryHike          -0.300598
PerformanceRating           1.695939
RelationshipSatisfaction   -1.184814
StandardHours               0.000000
StockOptionLevel            0.364634
TotalWorkingYears           0.918270
TrainingTimesLastYear       0.494993
WorkLifeBalance             0.419460
YearsAtCompany              3.935509
YearsInCurrentRole          0.477421
YearsSinceLastPromotion     3.612673
YearsWithCurrManager        0.171058
dtype: float64
In [458]:
#columns with kurt=0 
df=df.drop(columns=['EmployeeCount', 'StandardHours'])
In [459]:
df['Attrition_aux'] = np.where((df['Attrition']=='Yes'),1,0)
In [460]:
df.head()
Out[460]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender ... RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Attrition_aux
EmployeeNumber
1 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 2 Female ... 1 0 8 0 1 6 4 0 5 1
2 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 3 Male ... 4 1 10 3 3 10 7 1 7 0
4 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 4 Male ... 2 0 7 3 3 0 0 0 0 1
5 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 4 Female ... 3 0 8 3 3 8 7 3 0 0
7 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 Male ... 4 1 6 3 3 2 2 2 2 0

5 rows × 33 columns

2.2.3 Correlation Matrix

In [461]:
df_corr = df.corr(method = 'spearman')
figure = plt.figure(figsize=(16,10))
g=sns.heatmap(df_corr, annot=True, fmt = '.1g')
In [462]:
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 80
df_aux = df[['YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']].copy()

df_corr = df_aux.corr(method = 'spearman')
figure = plt.figure(figsize=(5,5))
g=sns.heatmap(df_corr, annot=True, fmt = '.1g')

2.2.4 Binning

In [463]:
df['age_bins'] = pd.cut(x=df['Age'], bins=[18, 25, 30, 35,40,45,50,55,60])
df['HourlyRate_bins'] = pd.cut(x=df['HourlyRate'], bins=[30, 40, 50, 60,70,80,90,100])

2.3 Visual exploration

2.3.1 Attrition demographics

2.3.1.1 Attrition

In [464]:
plt.rcParams['font.size'] = 11
plt.rcParams['figure.dpi'] = 80
Total=df['Attrition'].value_counts().sum()

values = df['Attrition'].value_counts().keys().tolist()
counts = df['Attrition'].value_counts().tolist()

sns.barplot(x=values,y=counts, palette="deep")

ax = plt.gca()

position = 0
for value in counts:
        ax.text(position, value+10, '{:.1f}%'.format(value / Total * 100) +'  (' + str(value)  +')', color = 'black', ha = 'center', size = 'large' )
        
        position = position + 1

plt.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
plt.xlabel('Attrition')
plt.ylabel('Exmployees count')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

2.3.1.2 Gender

In [465]:
var='Gender'
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 110
g = sns.catplot(x="Gender", col="Attrition", col_wrap=4,
                data=df,
                kind="count", height=2.5, aspect=.8, palette='inferno')
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+0.25, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+0.25, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium')
    
subset_df = df[df['Attrition'] == 'Yes']    

soma = subset_df[var].value_counts().sum()

ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+0.2, 
            p.get_height() /2.5, "{:.3}".format((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='w', 
            rotation='horizontal', 
            size='medium')    
    
subset_df = df[df['Attrition'] == 'No']    

soma = subset_df[var].value_counts().sum()
    
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+0.1, 
            p.get_height() /2.5, "{:.4}".format((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='w', 
            rotation='horizontal', 
            size='medium')
In [466]:
variaveis = ['Gender'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6, 5), dpi=100)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax=sns.barplot(x=subset_values, y=subset_counts, color='steelblue', data=subset_df,edgecolor='k'      )
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        plt.text(txt_x+p.get_width()/ 2.0,txt_y+0.05,txt, ha='center', va='bottom',fontsize='13')

        rects = ax.patches
      
    plt.ylabel('Percentage of employees in attrition ')
    plt.title(var)
    
plt.tight_layout()
plt.show()   

2.3.1.3 Age & Distance from home

In [467]:
variaveis = ['Age','DistanceFromHome'] 

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=100)

for i,var in enumerate(variaveis):
           
    sns.histplot(df[df['Attrition'] == 'No'][var],ax = axes[i], bins = 20,color='darkgreen').set_title(str(var)+ ' (Attrition= No)')
    
    res = stats (df[df['Attrition'] == 'No'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')          
     
for i in range (1,2):
    axes[i].set_ylabel('')  
    axes[i].set_xlabel('')  
for i in range (0,2):   
    axes[i].grid(color='grey', linestyle='-.', linewidth=0.5)    

plt.tight_layout()

plt.show()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=300)

for i,var in enumerate(variaveis):
            
    sns.histplot(df[df['Attrition'] == 'Yes'][var],ax = axes[i], bins = 20,color='darkred').set_title(str(var)+ ' (Attrition= Yes)')
    
    res = stats (df[df['Attrition'] == 'Yes'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=12,
           color= 'k')      
    for i in range (0,2):   
        axes[i].grid(color='grey', linestyle='-.', linewidth=0.7)    

for i in range (1,2):
    axes[i].set_ylabel('')  
    axes[i].set_xlabel('')  

plt.tight_layout()

plt.show()
In [468]:
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 100
plots1('Age')

2.3.1.4 Marital Status and Education Field

In [469]:
plt.rcParams['font.size'] = 10
variaveis = ['EducationField','MaritalStatus'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=variaveis_len, figsize=(12, 3), dpi=100)
for i,var in enumerate(variaveis):

    df_aux = (df.groupby(['Attrition'])[var]
                         .value_counts(normalize=True)
                         .rename('percentage')
                         .mul(100)
                         .reset_index()
                         )
    ax = sns.barplot(x=var, y="percentage", hue="Attrition", palette='Set2', data=df_aux,ax = axes[i])
    ax.set_ylim(0,60)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 

    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y+0.3,txt, ha='center', va='bottom')

        rects = ax.patches
In [470]:
variaveis = ['EducationField','MaritalStatus'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=variaveis_len, figsize=(12, 3), dpi=300)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=subset_df,edgecolor='k',
                     ax = axes[i])
    
    for i, v in enumerate(subset_counts):
    
        ax.text(v, i+0.1, str(v) , color='k', fontweight='bold',fontsize=12,ha='right')
      
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show()   
In [471]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['MaritalStatus'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(5, 3), dpi=100)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=subset_df,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    
    for i, v in enumerate(subset_counts):
    
        ax.text(v, i+0.1, str(v) + " %", color='k', fontweight='bold',fontsize=12,ha='right')
      
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show()   

2.3.1.5 Business Travel

In [472]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['BusinessTravel'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,4), dpi=80)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+2,txt_y+0.5,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,35)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show()   

2.3.2 Job demographics

2.3.2.1 Job demographics Years with current manager, years in current role, years at the company and years since last promotion

In [473]:
plt.rcParams['font.size'] = 7
variaveis = ['YearsWithCurrManager', 'YearsInCurrentRole', 'YearsAtCompany','YearsSinceLastPromotion'] 

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(12, 6), dpi=300)

for i,var in enumerate(variaveis):
            
    sns.histplot(df[df['Attrition'] == 'No'][var],ax = axes[i], bins = 20,color='darkgreen').set_title(str(var)+ ' (Attrition= No)')
    
    res = stats (df[df['Attrition'] == 'No'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')   
    axes[i].grid(color='grey', linestyle=':', linewidth=0.5)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
     
for i in range (1,3):
    axes[i].set_ylabel('')  

plt.tight_layout()
plt.show()

fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(12, 6), dpi=300)

for i,var in enumerate(variaveis):
            
    sns.histplot(df[df['Attrition'] == 'Yes'][var],ax = axes[i], bins = 20,color='darkred').set_title(str(var)+ ' (Attrition= Yes)')
    
    res = stats (df[df['Attrition'] == 'Yes'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=10,
           color= 'k')     
    axes[i].grid(color='grey', linestyle=':', linewidth=0.5)
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
     
for i in range (1,3):
    axes[i].set_ylabel('')  

plt.tight_layout()
plt.show()

2.3.2.2 Years at company

In [474]:
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 100
plots1('YearsAtCompany')

2.3.2.3 Department, Job Role , Business Travel, OverTime , Job Level

In [475]:
variaveis = ['Department','JobRole','OverTime','BusinessTravel','JobLevel'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=variaveis_len, ncols=1, figsize=(8, 20), dpi=100)
for i,var in enumerate(variaveis):

    df_aux = (df.groupby(['Attrition'])[var]
                         .value_counts(normalize=True)
                         .rename('percentage')
                         .mul(100)
                         .reset_index()
                         )
    ax = sns.barplot(x=var, y="percentage", hue="Attrition", palette='Set1', data=df_aux,ax = axes[i])
#     ax.set_ylim(0,100)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
    _ = plt.setp(ax.get_xticklabels(), rotation=20) 

    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y+0.1,txt, ha='center', va='bottom')

        rects = ax.patches
       
plt.tight_layout()
plt.show()       
In [476]:
plt.rcParams['font.size'] = 8
variaveis = ['Department','JobRole','OverTime','BusinessTravel'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=variaveis_len, ncols=1, figsize=(6, 8), dpi=200)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=subset_df,edgecolor='k',
                     ax = axes[i])
    
    for i, v in enumerate(subset_counts):
    
        ax.text(v, i+0.1, str(v) , color='w', fontweight='bold',fontsize=7,ha='right')
      
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show()      

2.3.2.4 Job Role

In [477]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobRole'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+3.5,txt_y+0.6,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,50)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show()   

2.3.2.5 OverTime

In [478]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['OverTime'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(4,3), dpi=80)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+7
                 ,txt_y+0.55,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,50)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show() 

2.3.2.6 Job Level

In [479]:
variaveis = ['JobLevel'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6, 5), dpi=100)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax=sns.barplot(x=subset_values, y=subset_counts, color='steelblue', data=subset_df,edgecolor='k'      )
    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        plt.text(txt_x+p.get_width()/ 2.0,txt_y+0.02,txt + ' %', ha='center', va='bottom',fontsize='13')

        rects = ax.patches
      
    plt.ylabel('Percentage of employees in attrition ')
    plt.title(var)
    
plt.tight_layout()
plt.show()    

2.3.2.7 Job Level and Years at company

In [480]:
plt.figure(figsize=(7,5),dpi=100)
box_plot = sns.boxplot(x="JobLevel", y="YearsAtCompany", data=df,palette="crest",hue='Attrition')

medians = df.groupby(['JobLevel'])['YearsAtCompany'].median()
vertical_offset =df['YearsAtCompany'].median() * 0.1 

# for xtick in box_plot.get_xticks():
#     box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick], 
#             horizontalalignment='center',size='medium',color='w',weight='semibold')
plt.xlabel("JobLevel")

plt.ylabel("YearsAtCompany")
plt.show()

2.3.2.8 Job Role and Business Travel

In [481]:
var1='JobRole'
var2='BusinessTravel'

plt.rcParams['font.size'] = 10

df_aux_YES =  df.loc[df['Attrition'] == 'Yes'] 

df_var1_vs_var2= pd.crosstab(df_aux_YES.JobRole, df_aux_YES.BusinessTravel ,normalize='index')*100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5), dpi=100)
sns.heatmap(df_var1_vs_var2, linewidths=.5, annot=True, cmap="YlGnBu" , fmt='.1f',annot_kws={"size": 8},ax = axes[0] )
axes[0] .set_title('Attrition = YES' )
axes[0].set_xlabel(var2, fontsize = 10) 
axes[0].set_ylabel(var1, fontsize = 10) 

df_aux_No =  df.loc[df['Attrition'] == 'No'] 

df_var1_vs_var3= pd.crosstab(df_aux_No.JobRole, df_aux_No.BusinessTravel,normalize='index')*100

sns.heatmap(df_var1_vs_var3, linewidths=.5, annot=True, cmap="YlGnBu"  ,fmt='.1f', annot_kws={"size": 8},ax = axes[1] )
axes[1] .set_title('Attrition = No' )
axes[1].set_xlabel(var2, fontsize = 10) 
axes[1].set_ylabel('') 

plt.tight_layout()
plt.show()   

2.3.2.9 Job Role vs Job Level

In [482]:
var1='JobRole'
var2='JobLevel'

df_aux_YES =  df.loc[df['Attrition'] == 'Yes'] 

df_var1_vs_var2= pd.crosstab(df_aux_YES.JobRole, df_aux_YES.JobLevel ,normalize='index')*100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), dpi=100)
sns.heatmap(df_var1_vs_var2, linewidths=.5, annot=True, cmap="YlGnBu" , fmt='.1f',annot_kws={"size": 8},ax = axes[0] )
axes[0] .set_title('Attrition = YES' )
axes[0].set_xlabel(var2, fontsize = 10) 
axes[0].set_ylabel(var1, fontsize = 10) 

df_aux_No =  df.loc[df['Attrition'] == 'No'] 

df_var1_vs_var3= pd.crosstab(df_aux_No.JobRole, df_aux_No.JobLevel,normalize='index')*100

sns.heatmap(df_var1_vs_var3, linewidths=.5, annot=True, cmap="YlGnBu"  ,fmt='.1f', annot_kws={"size": 8},ax = axes[1] )
axes[1] .set_title('Attrition = No' )
axes[1].set_xlabel(var2, fontsize = 10) 
axes[1].set_ylabel('') 

plt.tight_layout()
plt.show()  

2.3.3 Employee Satisfaction

2.3.3.1 Work Life Balance, job involvement

In [483]:
variaveis = ['JobInvolvement','WorkLifeBalance'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=variaveis_len, ncols=1, figsize=(4, 6), dpi=120)
for i,var in enumerate(variaveis):

    df_aux = (df.groupby(['Attrition'])[var]
                         .value_counts(normalize=True)
                         .rename('percentage')
                         .mul(100)
                         .reset_index()
                         )
    ax = sns.barplot(x=var, y="percentage", hue="Attrition", palette='Set2', data=df_aux,ax = axes[i])
#     ax.set_ylim(0,100)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 

    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y+0.1,txt, ha='center', va='bottom')

        rects = ax.patches
        sns.despine(ax=axes[i], offset=5)

plt.tight_layout()
plt.show()       

2.3.3.2 Environment Satisfaction, Job Satisfaction, Relationship Satisfaction, Job Involvement, Work Life Balance

In [484]:
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction','JobInvolvement','WorkLifeBalance'] 

plt.rcParams['font.size'] = 8
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=variaveis_len, ncols=1, figsize=(5, 13), dpi=120)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax=sns.barplot(x=subset_values, y=subset_counts, color='green', data=subset_df,edgecolor='k' ,ax=axes[i]     )
    
    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt, ha='center', va='top',fontsize=10,color='w')

        rects = ax.patches

    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    
plt.tight_layout()
plt.show()  

2.3.3.3 Job Role vs Work life Balance

In [485]:
var1='JobLevel'
var2='WorkLifeBalance'
plt.rcParams['font.size'] = 10

df_aux_YES =  df.loc[df['Attrition'] == 'Yes'] 

df_var1_vs_var2= pd.crosstab(df_aux_YES.JobLevel, df_aux_YES.WorkLifeBalance ,normalize='index')*100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 3), dpi=120)
sns.heatmap(df_var1_vs_var2, linewidths=.5, annot=True, cmap="YlGnBu" , fmt='.1f',annot_kws={"size": 8},ax = axes[0] )
axes[0] .set_title('Attrition = YES' )
axes[0].set_xlabel(var2, fontsize = 10) 
axes[0].set_ylabel(var1, fontsize = 10) 

df_aux_No =  df.loc[df['Attrition'] == 'No'] 

df_var1_vs_var3= pd.crosstab(df_aux_No.JobLevel, df_aux_No.WorkLifeBalance,normalize='index')*100

sns.heatmap(df_var1_vs_var3, linewidths=.5, annot=True, cmap="YlGnBu"  ,fmt='.1f', annot_kws={"size": 8},ax = axes[1] )
axes[1] .set_title('Attrition = No' )
axes[1].set_xlabel(var2, fontsize = 10) 
axes[1].set_ylabel('') 

plt.tight_layout()
plt.show()  

2.3.3.4 Job Satisfaction vs Years at company

In [486]:
plt.figure(figsize=(6,5),dpi=100)
box_plot = sns.boxplot(x="JobSatisfaction", y="YearsAtCompany", data=df,palette="crest",hue='Attrition')

medians = df.groupby(['JobSatisfaction'])['YearsAtCompany'].median()
vertical_offset =df['YearsAtCompany'].median() * 0.1 

# for xtick in box_plot.get_xticks():
#     box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick], 
#             horizontalalignment='center',size='medium',color='w',weight='semibold')
plt.xlabel("JobSatisfaction")

plt.ylabel("YearsAtCompany")
plt.show()

2.3.3.5 Job Role vs Job Satisfaction

In [487]:
var1='JobRole'
var2='JobSatisfaction'

df_aux_YES =  df.loc[df['Attrition'] == 'Yes'] 

df_var1_vs_var2= pd.crosstab(df_aux_YES.JobRole, df_aux_YES.JobSatisfaction ,normalize='index')*100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), dpi=150)
sns.heatmap(df_var1_vs_var2, linewidths=.5, annot=True, cmap="YlGnBu" , fmt='.1f',annot_kws={"size": 8},ax = axes[0] )
axes[0] .set_title('Attrition = YES' )
axes[0].set_xlabel(var2, fontsize = 10) 
axes[0].set_ylabel(var1, fontsize = 10) 

df_aux_No =  df.loc[df['Attrition'] == 'No'] 

df_var1_vs_var3= pd.crosstab(df_aux_No.JobRole, df_aux_No.JobSatisfaction,normalize='index')*100

sns.heatmap(df_var1_vs_var3, linewidths=.5, annot=True, cmap="YlGnBu"  ,fmt='.1f', annot_kws={"size": 8},ax = axes[1] )
axes[1] .set_title('Attrition = No' )
axes[1].set_xlabel(var2, fontsize = 10) 
axes[1].set_ylabel('') 

plt.tight_layout()
plt.show()  

2.3.4 Income and hours of work

In [488]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['MonthlyIncome','PercentSalaryHike'] 

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

for i,var in enumerate(variaveis):
           
    sns.histplot(df[df['Attrition'] == 'No'][var],ax = axes[i], bins = 20,color='darkgreen').set_title(str(var)+ ' (Attrition= No)')
    
    res = stats (df[df['Attrition'] == 'No'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')          
     
for i in range (1,2):
    axes[i].set_ylabel('')
    
for i in range (0,2):   
    axes[i].grid(color='grey', linestyle='-.', linewidth=0.5)    
    sns.despine(ax=axes[i], offset=5)
    axes[i].set_xlabel('')

plt.tight_layout()

plt.show()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=300)

for i,var in enumerate(variaveis):
            
    sns.histplot(df[df['Attrition'] == 'Yes'][var],ax = axes[i], bins = 20,color='darkred').set_title(str(var)+ ' (Attrition= Yes)')
    
    res = stats (df[df['Attrition'] == 'Yes'][var])   
    
    axes[i].text(0.1+i*1.25, -0.15, 'Mean = ' + str(res[0]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.2, 'Skewness = ' + str(res[3]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.25, 'Q1 = ' + str(res[1]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')
    axes[i].text(0.1+i*1.25, -0.3, 'Q3 = ' + str(res[2]), transform=axes[0].transAxes, fontsize=13,
           color= 'k')      
    for i in range (0,2):   
        axes[i].grid(color='grey', linestyle='-.', linewidth=0.7)  
        axes[i].set_xlabel('')
        
        sns.despine(ax=axes[i], offset=5)
     
for i in range (1,2):
    axes[i].set_ylabel('')  
    
plt.tight_layout()

plt.show()

2.3.4.1 Job Role vs Montly Income

In [490]:
fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(8, 5), dpi=100)
ax = sns.boxplot(x="JobRole", y="MonthlyIncome", data=df,palette="GnBu",hue='Attrition')

medians = df.groupby(['JobRole'])['MonthlyIncome'].median()
vertical_offset =df['MonthlyIncome'].median() * 0.1 
ax.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
_ = plt.setp(ax.get_xticklabels(), rotation=20,ha='right') 

plt.xlabel("JobRole")

plt.ylabel("MonthlyIncome")

plt.show()

2.3.4.2 Age vs Montly Income

In [491]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
plt.figure(figsize=(10,6))
box_plot = sns.boxplot(x="age_bins", y="MonthlyIncome", data=df,palette="crest",hue='Attrition')

medians = df.groupby(['age_bins'])['MonthlyIncome'].median()
vertical_offset =df['MonthlyIncome'].median() * 0.1 

# for xtick in box_plot.get_xticks():
#     box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick], 
#             horizontalalignment='center',size='medium',color='w',weight='semibold')
plt.xlabel("Age group")
plt.title ("Montly Income per age group")
plt.ylabel("Montly Income")
plt.show()

2.3.5 Performance and training relation

In [492]:
variaveis = ['PerformanceRating', 'TrainingTimesLastYear'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=2, figsize=(10, 5), dpi=100)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax=sns.barplot(x=subset_values, y=subset_counts, color='green', data=subset_df,edgecolor='k' ,ax=axes[i]     )
    
    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt, ha='center', va='top',fontsize=14,color='w')

        rects = ax.patches

    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    sns.despine(ax=ax, offset=5)

plt.tight_layout()
plt.show()     

2.3.6 Training Times Last Year

In [493]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['TrainingTimesLastYear'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='steelblue', data=df_new,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='top',fontsize=16,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show()   

2.3.7 Performance and last promotion/SalarayHike relation

In [494]:
plt.figure(figsize=(10,6))
box_plot = sns.boxplot(x="PerformanceRating", y="PercentSalaryHike", data=df,palette="Set2",hue='Attrition')

plt.ylabel("PercentSalaryHike")

plt.xlabel("PerformanceRating")
plt.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
plt.show()

2.3.7.1 Age vs Years since last promotion

In [495]:
plt.figure(figsize=(10,6))
plt.rcParams['font.size'] = 13
plt.rcParams['figure.dpi'] = 100
box_plot = sns.boxplot(y="age_bins", x="YearsSinceLastPromotion", data=df,palette="Set3",hue='Attrition')

plt.ylabel("Age group")
plt.title ("YearsSinceLastPromotion per age group")
plt.xlabel("YearsSinceLastPromotion")
plt.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
plt.show()

2.3.8 Stock option level

2.3.8.1 Age vs Stock option level

In [496]:
variaveis = ['StockOptionLevel','age_bins'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=variaveis_len, ncols=1, figsize=(6, 10), dpi=100)
for i,var in enumerate(variaveis):

    df_aux = (df.groupby(['Attrition'])[var]
                         .value_counts(normalize=True)
                         .rename('percentage')
                         .mul(100)
                         .reset_index()
                         )
    ax = sns.barplot(x=var, y="percentage", hue="Attrition", palette='Set1', data=df_aux,ax = axes[i])
#     ax.set_ylim(0,100)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
    
    for p in axes[i].patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        axes[i].text(txt_x+p.get_width()/ 2.0,txt_y+0.1,txt, ha='center', va='bottom')
        _ = plt.setp(ax.get_xticklabels(), rotation=20,ha='right') 
        
        rects = ax.patches

plt.tight_layout()
plt.show()   
In [497]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['StockOptionLevel'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='steelblue', data=df_new,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightblue')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='top',fontsize=16,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show()   

2.3.9 Years at company

In [498]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['YearsAtCompany'] 

Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8, 6), dpi=100)
for i,var in enumerate(variaveis):

    subset_df = df[df['Attrition'] == 'Yes']    

    subset_counts = subset_df[var].value_counts().tolist()
    subset_values = subset_df[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for x in subset_values:
        total_counts.append((df[var].values == x).sum())
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/total_counts[k]*100 
        subset_counts[k]=round(subset_counts[k],1)
        
    ax=sns.barplot(x=subset_values, y=subset_counts, color='red', data=subset_df,edgecolor='k' ,alpha=0.8    )
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt, ha='center', va='bottom',fontsize=9,color='k')

        rects = ax.patches

    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    sns.despine(ax=ax, offset=5)  
    
plt.tight_layout()
plt.show()       

2.3.9.1 Years at company vs Total working years

In [499]:
plt.figure(figsize=(8,4))
sns.scatterplot(x = 'YearsAtCompany', y= 'TotalWorkingYears',hue='Attrition', data = df  ,palette ='flare')
plt.xlabel("YearsAtCompany")
plt.ylabel("TotalWorkingYears")
# plt.legend(loc='lower left', bbox_to_anchor=(0.6,0.5))
plt.legend(shadow=True, fancybox=True, ncol = 5, title="Attrition")
# plt.ylim(top=4000)
plt.show()

2.3.9.2 Years at company vs Year since last promotion

In [501]:
plt.figure(figsize=(8,4))
sns.scatterplot(x = 'YearsAtCompany', y= 'YearsSinceLastPromotion',hue='Attrition', data = df  ,palette ='flare')
plt.xlabel("YearsAtCompany")
plt.ylabel("YearsSinceLastPromotion")
# plt.legend(loc='lower left', bbox_to_anchor=(0.6,0.5))
plt.legend(shadow=True, fancybox=True, ncol = 5, title="Attrition")
# plt.ylim(top=4000)
plt.show()

3 . Data Transformation

In [502]:
df_transf=df.copy()
In [503]:
df_transf.describe(include=[object]).T
Out[503]:
count unique top freq
Attrition 1470 2 No 1233
BusinessTravel 1470 3 Travel_Rarely 1043
Department 1470 3 Research & Development 961
EducationField 1470 6 Life Sciences 606
Gender 1470 2 Male 882
JobRole 1470 9 Sales Executive 326
MaritalStatus 1470 3 Married 673
Over18 1470 1 Y 1470
OverTime 1470 2 No 1054

3.1 Transform categorical variables

In [504]:
df_transf['OverTime_2'] = df_transf ['OverTime'].replace('No',0).replace('Yes',1)
df_transf['Attrition_2'] = np.where((df['Attrition']=='Yes'),1,0)
df_transf['Single'] = np.where((df['MaritalStatus']=='Single'),1,0)
df_transf['Travel'] = np.where((df['BusinessTravel']=='Non-Travel'),0,1)
# df_transf2=df_transf.drop(['OverTime', 'Attrition','MaritalStatus','BusinessTravel','Department',
#                 'EducationField','Gender','Over18','Attrition_aux','age_bins','HourlyRate_bins'], axis=1)
In [505]:
df_dummies = pd.get_dummies(df_transf)
df_dummies
Out[505]:
Age DailyRate DistanceFromHome Education EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome ... age_bins_(45, 50] age_bins_(50, 55] age_bins_(55, 60] HourlyRate_bins_(30, 40] HourlyRate_bins_(40, 50] HourlyRate_bins_(50, 60] HourlyRate_bins_(60, 70] HourlyRate_bins_(70, 80] HourlyRate_bins_(80, 90] HourlyRate_bins_(90, 100]
EmployeeNumber
1 41 1102 1 2 2 94 3 2 4 5993 ... 0 0 0 0 0 0 0 0 0 1
2 49 279 8 1 3 61 2 2 2 5130 ... 1 0 0 0 0 0 1 0 0 0
4 37 1373 2 2 4 92 2 1 3 2090 ... 0 0 0 0 0 0 0 0 0 1
5 33 1392 3 4 4 56 3 1 3 2909 ... 0 0 0 0 0 1 0 0 0 0
7 27 591 2 1 1 40 3 1 2 3468 ... 0 0 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2061 36 884 23 2 3 41 4 2 4 2571 ... 0 0 0 0 1 0 0 0 0 0
2062 39 613 6 1 4 42 2 3 1 9991 ... 0 0 0 0 1 0 0 0 0 0
2064 27 155 4 3 2 87 4 2 2 6142 ... 0 0 0 0 0 0 0 0 1 0
2065 49 1023 2 3 4 63 2 2 2 5390 ... 1 0 0 0 0 0 1 0 0 0
2068 34 628 8 3 2 82 4 2 3 4404 ... 0 0 0 0 0 0 0 0 1 0

1470 rows × 74 columns

In [506]:
scaler = MinMaxScaler().fit(df_dummies)
scaled_df = scaler.transform(df_dummies)
In [507]:
cols = df_dummies.columns
scaled_df = pd.DataFrame(scaled_df, columns = cols)
scaled_df.head().T
Out[507]:
0 1 2 3 4
Age 0.547619 0.738095 0.452381 0.357143 0.214286
DailyRate 0.715820 0.126700 0.909807 0.923407 0.350036
DistanceFromHome 0.000000 0.250000 0.035714 0.071429 0.035714
Education 0.250000 0.000000 0.250000 0.750000 0.000000
EnvironmentSatisfaction 0.333333 0.666667 1.000000 1.000000 0.000000
... ... ... ... ... ...
HourlyRate_bins_(50, 60] 0.000000 0.000000 0.000000 1.000000 0.000000
HourlyRate_bins_(60, 70] 0.000000 1.000000 0.000000 0.000000 0.000000
HourlyRate_bins_(70, 80] 0.000000 0.000000 0.000000 0.000000 0.000000
HourlyRate_bins_(80, 90] 0.000000 0.000000 0.000000 0.000000 0.000000
HourlyRate_bins_(90, 100] 1.000000 0.000000 1.000000 0.000000 0.000000

74 rows × 5 columns

4. Segmentation

4.1 Segmentação attrition='No'

In [508]:
df_seg_att=df.loc[df['Attrition'] == 'Yes']

4.1.1 Job Role

In [509]:
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobRole'] 

Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):

    subset_counts = df_seg_att[var].value_counts().tolist()
    print(subset_counts)
    subset_values = df_seg_att[var].value_counts().keys().tolist()
    
           
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+3.5,txt_y+0.6,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,50)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show() 
[62, 57, 47, 33, 12, 10, 9, 5, 2]

4.1.2 Stock option level

In [510]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['StockOptionLevel'] 

Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6,4), dpi=80)
for i,var in enumerate(variaveis):
    subset_counts = df_seg_att[var].value_counts().tolist()
    subset_values = df_seg_att[var].value_counts().keys().tolist()
    
    total_counts=[]
      
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green', data=df_seg_att,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='top',fontsize=16,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show()   

4.1.3 Job level

In [511]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobLevel'] 

Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):
    subset_counts = df_seg_att[var].value_counts().tolist()
    subset_values = df_seg_att[var].value_counts().keys().tolist()
    
    total_counts=[]
        
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green', data=df_seg_att,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='bottom',fontsize=16,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show()   

4.1.4 Job involvement

In [512]:
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobInvolvement'] 


Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):
    subset_counts = df_seg_att[var].value_counts().tolist()
    subset_values = df_seg_att[var].value_counts().keys().tolist()
    
    total_counts=[]
      
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green', data=df_seg_att,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='bottom',fontsize=16,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show()   

4.1.5 Business Travel

In [513]:
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
variaveis = ['BusinessTravel'] 


Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6,3))
for i,var in enumerate(variaveis):

    subset_counts = df_seg_att[var].value_counts().tolist()
    print(subset_counts)
    subset_values = df_seg_att[var].value_counts().keys().tolist()
     
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+6,txt_y+0.6,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,80)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    
plt.tight_layout()
plt.show() 
[156, 69, 12]

4.1.6 OverTime

In [514]:
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
variaveis = ['OverTime'] 


Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6,3))
for i,var in enumerate(variaveis):
    subset_counts = df_seg_att[var].value_counts().tolist()
    print(subset_counts)
    subset_values = df_seg_att[var].value_counts().keys().tolist()
  
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green', data=df_new,edgecolor='k',
                     order=df_new.sort_values('subset_counts',ascending = False).subset_values)
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_width().round(2)) 
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+6,txt_y+0.6,txt + " %", ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,80)
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=45) 
    

plt.tight_layout()
plt.show() 
[127, 110]

4.1.7 Training times last year

In [515]:
plt.rcParams['font.size'] = 9
plt.rcParams['figure.dpi'] = 100
variaveis = ['TrainingTimesLastYear'] 

Total=df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)

fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6,4), dpi=100)
for i,var in enumerate(variaveis):
    subset_counts = df_seg_att[var].value_counts().tolist()
    subset_values = df_seg_att[var].value_counts().keys().tolist()
    
    total_counts=[]
    
    for k,j in enumerate(subset_counts):
        subset_counts[k]= subset_counts[k]/Total*100 
        subset_counts[k]=round(subset_counts[k],1)
    
    df_new = pd.DataFrame({"subset_counts":subset_counts,
                  "subset_values":subset_values})
        
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green', data=df_seg_att,edgecolor='k')
    patch_h = []    
    for patch in ax.patches:
        reading = patch.get_height()
        patch_h.append(reading)
   
    idx_tallest = np.argmax(patch_h)   
    # np.argmax return the index of largest value of the list

    ax.patches[idx_tallest].set_facecolor('lightgreen')  
    values=df_new.sort_values('subset_counts',ascending = False).subset_values
    
    for p in ax.patches:
        txt = str(p.get_height().round(2)) 
        txt_x = p.get_x() 
        txt_y = p.get_height()
        ax.text(txt_x+p.get_width()/ 2.0,txt_y-0.3,txt +  ' %', ha='center', va='bottom',fontsize=12,color='k')
    
    ax.set_ylabel('Percentage of employees in attrition ')
    
    ax.set_title(var)
    
    _ = plt.setp(ax.get_xticklabels(), rotation=0) 
    
plt.tight_layout()
plt.show() 

4.2 Segmentation by job Role

4.2.1 Sales representative (considering attrition)

4.2.1.1 Gender

In [516]:
var='Gender'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() /2.5, "%.1f" % ((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='w', 
            rotation='horizontal', 
            size='medium', ha='center')    

ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() /2.5, "%.1f" % ((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='w', 
            rotation='horizontal', 
            size='medium',ha='center')

plt.show()

4.2.1.2 Overtime

In [517]:
var='OverTime'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() /2.5, "%.1f" % ((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='k', 
            rotation='horizontal', 
            size='medium', ha='center')    
    
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() /2.5, "%.1f" % ((p.get_height()/soma)*100) + " %", 
            #Used to format it K representation
            color='k', 
            rotation='horizontal', 
            size='medium',ha='center')
plt.show()

4.2.1.3 Montly Income

In [518]:
var='MonthlyIncome' 

collumn=df[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()
dataframe=df.loc[df['JobRole'] == 'Sales Representative']

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(500,7000)  
axes[0,1].set_xlim(500,7000)  
axes[1,0].set_xlim(500,7000)  
axes[1,1].set_xlim(500,7000)  

plt.xlabel(var)

plt.show()

4.2.1.4 Job Level

In [519]:
var='JobLevel'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.1.5 Stock option level

In [520]:
var='StockOptionLevel'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.1.6 Satisfaction

In [521]:
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction','JobInvolvement','WorkLifeBalance'] 
var='JobSatisfaction'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.1.7 Business Travel

In [522]:
var='BusinessTravel'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = 8,6

g = sns.catplot(y=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu',edgecolor='k')
ax = g.facet_axis(0,0)
    
for p in ax.patches:
        txt = str(p.get_width()) 
        txt = txt.strip(".0")
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+3.5,txt_y+0.6,txt, ha='center', va='bottom',fontsize='13')

        rects = ax.patches
ax = g.facet_axis(0,1)
for p in ax.patches:
        txt = str(p.get_width())  
        
        txt_x = p.get_width() 
        txt_y = p.get_y()
        
        plt.text(txt_x+3.5,txt_y+0.6,txt, ha='center', va='bottom',fontsize='13')

        rects = ax.patches
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()
posx and posy should be finite values
posx and posy should be finite values

4.2.1.8 Years in current Role

In [523]:
var='YearsInCurrentRole' 

collumn=df[var]

plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

maxim=collumn.max()

minim=collumn.min()

dataframe=df.loc[df['JobRole'] == 'Sales Representative']

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (5,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].set_ylabel('')  
axes[0,0].set_xlim(minim-1,maxim)  
axes[0,1].set_xlim(minim-1,maxim)  
axes[1,0].set_xlim(minim-1,maxim)  
axes[1,1].set_xlim(minim-1,maxim)  

plt.xlabel(var)

plt.show()

4.2.2 Laboratory Technician (considering attrition)

4.2.2.1 Montly Income

In [524]:
var='MonthlyIncome' 

dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']

collumn=dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()
lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(500,maxim+500)  
axes[0,1].set_xlim(500,maxim+500)  
axes[1,0].set_xlim(500,maxim+500)  
axes[1,1].set_xlim(500,maxim+500)  

plt.xlabel(var)


plt.show()

4.2.2.2 Years in current role

In [525]:
var='YearsInCurrentRole' 

dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']

collumn=dataframe[var]

plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100

maxim=collumn.max()

minim=collumn.min()

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(minim-1,maxim+2)  
axes[0,1].set_xlim(minim-1,maxim+2)  
axes[1,0].set_xlim(minim-1,maxim+2)  
axes[1,1].set_xlim(minim-1,maxim+2)  

plt.xlabel(var)


plt.show()

4.2.2.3 Years at company

In [526]:
var='YearsAtCompany' 

dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']

collumn=dataframe[var]

plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100

maxim=collumn.max()

minim=collumn.min()

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(minim-1,maxim+2)  
axes[0,1].set_xlim(minim-1,maxim+2)  
axes[1,0].set_xlim(minim-1,maxim+2)  
axes[1,1].set_xlim(minim-1,maxim+2)  

plt.xlabel(var)


plt.show()

4.2.2.4 Stock option level

In [527]:
var='StockOptionLevel'
dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()


plt.show()

4.2.2.5 Job Level

In [528]:
var='JobLevel'
dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.2.6 Over Time

In [529]:
var='OverTime'
dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]   

soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.2.7 Work Life balance

In [530]:
var='WorkLifeBalance'
dataframe=df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.2.8 Satisfaction

In [531]:
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction','JobInvolvement','WorkLifeBalance'] 
var='JobInvolvement'
dataframe=df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.3 Human resources (considering attrition)

4.2.3.1 Montly Income

In [532]:
var='MonthlyIncome' 

dataframe=df.loc[df['JobRole'] == 'Human Resources']

collumn=dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(500,maxim+500)  
axes[0,1].set_xlim(500,maxim+500)  
axes[1,0].set_xlim(500,maxim+500)  
axes[1,1].set_xlim(500,maxim+500)  

plt.xlabel(var)


plt.show()

4.2.3.2 Over time

In [533]:
var='OverTime'
dataframe=df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]   

soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.3.3 Training times last year

In [534]:
var='TrainingTimesLastYear' 

dataframe=df.loc[df['JobRole'] == 'Human Resources']

collumn=dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()

lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(0,maxim+3)  
axes[0,1].set_xlim(0,maxim+3)  
axes[1,0].set_xlim(0,maxim+3)  
axes[1,1].set_xlim(0,maxim+3)  

plt.xlabel(var)


plt.show()

4.2.3.4 Stock option level

In [535]:
var='StockOptionLevel'
dataframe=df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    txt = str(p.get_height()) 
    txt = txt.strip(".0")
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
            txt, 
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')

ax = g.facet_axis(0,1)
for p in ax.patches:
    txt = str(p.get_height()) 
    txt = txt.strip(".0")
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
            txt, color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()
posx and posy should be finite values
posx and posy should be finite values

4.2.3.5 Job Level

In [536]:
var='JobLevel'
dataframe=df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    txt = int(p.get_height()) 
    
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
            txt, 
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
 
ax = g.facet_axis(0,1)
for p in ax.patches:
    txt = (p.get_height()).astype('Int64') 
    
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
            txt, color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()
posx and posy should be finite values
posx and posy should be finite values

4.2.3.6 Distance from home

In [537]:
var='DistanceFromHome' 

dataframe=df.loc[df['JobRole'] == 'Human Resources']

collumn=dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()
lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(0,maxim+2)  
axes[0,1].set_xlim(0,maxim+2)  
axes[1,0].set_xlim(0,maxim+2)  
axes[1,1].set_xlim(0,maxim+2)  

plt.xlabel(var)

plt.show()

4.2.3.7 Satisfaction

In [538]:
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction','JobInvolvement','WorkLifeBalance'] 
var='EnvironmentSatisfaction'
dataframe=df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100

g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu', legend=['yes','no'])
ax = g.facet_axis(0,0)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()),   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
ax = g.facet_axis(0,1)
for p in ax.patches:
    ax.text(p.get_x()+ p.get_width()/2, 
            p.get_height() * 1.02, 
           (p.get_height()).astype('Int64') ,   #Used to format it K representation
            color='black', 
            rotation='horizontal', 
            size='medium',ha='center')
    
subset_df = dataframe[dataframe[var] == 1]    
soma=dataframe.shape[0]
# soma = subset_df[var].value_counts().sum()

plt.show()

4.2.3.8 Percent Salary hike

In [539]:
var='PercentSalaryHike' 

dataframe=df.loc[df['JobRole'] == 'Human Resources']

collumn=dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim=collumn.max()
minim=collumn.min()
lista = dataframe[dataframe['Attrition'] == 'Yes']    

fig, axes = plt.subplots(2,2, figsize = (7,6),gridspec_kw={'height_ratios': [1, 4]})
g1=sns.histplot(ax = axes[1,0], data = lista ,x = var, color="coral", bins = 10)

g2=sns.boxplot(ax = axes[0,0], data = lista, x = var, color="orange" )

g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats (lista[var])

axes[1,0].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')
axes[1,0].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,0].transAxes, fontsize=10,
       color= 'k')

plt.xlim(minim,maxim)

plt.xlabel(var)

lista = dataframe[dataframe['Attrition'] == 'No']      

g1=sns.histplot(ax = axes[1,1], data = lista ,x = var, color="dodgerblue", bins = 10)

g2=sns.boxplot(ax = axes[0,1], data = lista, x = var, color="skyblue" )

g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats (lista[var])

axes[1,1].text(0, -0.2, 'Mean = ' + str(res[0]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.2, 'Skewness = ' + str(res[3]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0, -0.25, 'Q1 = ' + str(res[1]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')
axes[1,1].text(0.5, -0.25, 'Q3 = ' + str(res[2]), transform=axes[1,1].transAxes, fontsize=10,
       color= 'k')

axes[1,1].set_ylabel('')  

axes[0,0].set_xlim(0,maxim+2)  
axes[0,1].set_xlim(0,maxim+2)  
axes[1,0].set_xlim(0,maxim+2)  
axes[1,1].set_xlim(0,maxim+2)  

plt.xlabel(var)


plt.show()